import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow import keras
import umap.umap_ as umap
%config InlineBackend.figure_format = 'svg'
np.random.seed(42)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
# Load the preprocessed diabetes-mellitus dataset and derive a binary
# hypertension (HTN) label, then drop the raw blood-pressure columns.
data = pd.read_csv('Preprocessed_DM_xx.csv')
np.random.seed(42)
data = data.sample(frac=1)  # Shuffle the data set (original index labels are kept)
np.random.seed(42)
# A respondent counts as hypertensive if they take BP-lowering medication or
# either first reading crosses the clinical cut-off (SBP >= 140 or DBP >= 90).
htn_mask = ((data['Currently.taking.a.prescribed.medicine.to.lower.BP'] != 0)
            | (data['First.SYSTOLIC.reading'] >= 140)
            | (data['First.DIASTOLIC.reading'] >= 90))
# BUG FIX: the original collected matching index *labels* and did
# HTN_cols[[labels]] = 1 on a positional zeros array.  After sample(frac=1)
# the labels no longer equal positions, so the 1s landed on the wrong rows
# (and the nested-list fancy indexing is deprecated/removed in modern NumPy).
# Assigning the label-aligned boolean Series fixes both; astype(float) keeps
# the original 0.0/1.0 column dtype from np.zeros.
data['HTN'] = htn_mask.astype(float)
# The raw BP / medication columns are now redundant with the HTN label.
data = data.drop(["First.SYSTOLIC.reading", "First.DIASTOLIC.reading",
                  "Currently.taking.a.prescribed.medicine.to.lower.BP"], axis=1)
data = data.reset_index(drop=True)
data.columns
# Drop second/third BP readings and derived status columns that would
# duplicate or leak the outcomes into the clustering features.
data=data.drop(["Hb_adjust_alt_smok","Second.SYSTOLIC.reading","Second.DIASTOLIC.reading","Third.SYSTOLIC.reading","Third.DIASTOLIC.reading","Hb_status","Glucose.level",'SBP_status'], axis=1)
# Remove rows carrying sentinel "missing" codes (99.99, 0.5, -1, 0 depending
# on the column's encoding in the survey export).
data=data.loc[data['BMI'] != 99.99]
data=data.loc[data['Hemoglobin.level..g.dl...1.decimal.'] != 99.99]
data=data.loc[data['Currently.has.asthma'] != .5]
data=data.loc[data['Currently.has.thyroid.disorder'] != .5]
data=data.loc[data['Currently.has.heart.disease'] != .5]
data=data.loc[data['Currently.has.cancer'] != .5]
# Keep only respondents with a diabetes history, then drop the bookkeeping
# and history/status columns no longer needed as features.
data=data.loc[data['DM_history'] == 1]
data=data.loc[data['Type.of.caste.or.tribe.of.the.household.head'] != 0]
data=data.loc[data['Time.to.get.to.water.source..minutes.'] != -1]
data=data.drop(["Unnamed: 0","DM_status","DM_history"], axis=1)
np.random.seed(42)
# Re-number rows 0..n-1.  The original built pd.Series(range(10125)) with a
# hard-coded row count, which silently breaks if any filter above changes
# the number of surviving rows; reset_index(drop=True) is equivalent for the
# recorded 10125-row result and works for any length.
data = data.reset_index(drop=True)
data.shape
# recorded output: (10125, 41)
# --- Baseline pipeline: plain UMAP embedding of the mixed data + k-means ---
from fdc.fdc import feature_clustering
from fdc.fdc import canberra_modified
modified_can = canberra_modified  # alias for the modified Canberra distance used by FDC below
from fdc.fdc import FDC, Clustering
# Ordinal survey features (ordered categories: frequencies, graded flags).
ord_list=['Drinks.alcohol', 'Smoking_stat','Has.refrigerator',
'Has.bicycle', 'Has.motorcycle.scooter', 'Has.car.truck', 'Owns.livestock..herds.or.farm.animals','Frequency.takes.milk.or.curd',
'Frequency.eats.pulses.or.beans',
'Frequency.eats.dark.green.leafy.vegetable', 'Frequency.eats.fruits',
'Frequency.eats.eggs', 'Frequency.eats.fish',
'Frequency.eats.chicken.or.meat', 'Frequency.eats.fried.food',
'Frequency.takes.aerated.drinks','Frequency.household.members.smoke.inside.the.house','Wealth.index',
'Highest.educational.level','Currently.has.asthma',
'Currently.has.thyroid.disorder', 'Currently.has.heart.disease',
'Currently.has.cancer', 'Suffers.from.TB','HTN' ]
# Continuous (numeric) features.
cont_list=['Current.age','BMI','Hemoglobin.level..g.dl...1.decimal.','Time.to.get.to.water.source..minutes.']
# Nominal features (unordered categories).
nom_list=['Household.head.s.religion', 'Sex', 'Type.of.place.of.residence', 'Household.structure',
'Type.of.caste.or.tribe.of.the.household.head','Type.of.cooking.fuel','Source.of.drinking.water']
# Embed the full data with UMAP.  Positional args presumably map to
# (n_neighbors=15, min_dist=0.1, metric='euclidean', data, visual=True),
# mirroring the keyword names in the FDC.normalize call further down —
# TODO confirm against the fdc package documentation.
umap_emb=feature_clustering(15,0.1,'euclidean',data,True)
from fdc.clustering import Clustering  # NOTE(review): shadows Clustering imported from fdc.fdc above
# Clustering wrapper exposing K_means / Agglomerative / DBSCAN; the same
# embedding is passed for both the high- and low-dimensional slots here.
umap_clustering=Clustering(umap_emb,umap_emb,True)
umap_cluster_list,umap_cluster_counts=umap_clustering.K_means(2)
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
# Project-local validation helpers; Silhouette_visual, elbow_plot,
# dunn_index and cluster_wise_df used below come from this module.
from cluster_val import *
# Internal validity of the k-means (k=2) partition of the UMAP embedding.
silhouette_score(umap_emb, umap_cluster_list, metric='euclidean')
0.5998863206501435
Silhouette_visual(umap_emb)
elbow_plot(umap_emb)
dunn_index(cluster_wise_df(umap_emb,umap_cluster_list))
0.00287162771413835
# Agglomerative clustering (2 clusters, euclidean, Ward linkage) on the
# same embedding, scored the same way.
umap_cluster_list_agglo,umap_cluster_counts_agglo=umap_clustering.Agglomerative(2,'euclidean','ward')
silhouette_score(umap_emb, umap_cluster_list_agglo, metric='euclidean')
0.6148981607281019
dunn_index(cluster_wise_df(umap_emb,umap_cluster_list_agglo))
0.2598093251538023
# DBSCAN — args presumably (eps=0.8, min_samples=160); verify fdc signature.
umap_cluster_list_dbscan,umap_cluster_counts_dbscan=umap_clustering.DBSCAN(0.8,160)
#removing noise indices from the embeddings
# DBSCAN marks noise as -1; drop those rows before computing the scores
# (NOTE: this rebinds umap_emb, so later cells see the filtered embedding).
non_noise_indices= np.where(np.array(umap_cluster_list_dbscan)!=-1)
umap_emb= umap_emb.iloc[non_noise_indices]
#FDC_emb_low= FDC_emb_low.iloc[non_noise_indices]
umap_cluster_list_dbscan= np.array(umap_cluster_list_dbscan)[non_noise_indices]
silhouette_score(umap_emb, umap_cluster_list_dbscan, metric='euclidean')
0.6153916171327859
dunn_index(cluster_wise_df(umap_emb,umap_cluster_list_dbscan))
0.2598093251538023
# --- FDC pipeline: type-aware distances per feature group, then UMAP -------
# NOTE(review): these three lists are byte-for-byte re-definitions of the
# ones above (redundant notebook cell, kept as recorded).
ord_list=['Drinks.alcohol', 'Smoking_stat','Has.refrigerator',
'Has.bicycle', 'Has.motorcycle.scooter', 'Has.car.truck', 'Owns.livestock..herds.or.farm.animals','Frequency.takes.milk.or.curd',
'Frequency.eats.pulses.or.beans',
'Frequency.eats.dark.green.leafy.vegetable', 'Frequency.eats.fruits',
'Frequency.eats.eggs', 'Frequency.eats.fish',
'Frequency.eats.chicken.or.meat', 'Frequency.eats.fried.food',
'Frequency.takes.aerated.drinks','Frequency.household.members.smoke.inside.the.house','Wealth.index',
'Highest.educational.level','Currently.has.asthma',
'Currently.has.thyroid.disorder', 'Currently.has.heart.disease',
'Currently.has.cancer', 'Suffers.from.TB','HTN' ]
cont_list=['Current.age','BMI','Hemoglobin.level..g.dl...1.decimal.','Time.to.get.to.water.source..minutes.']
nom_list=['Household.head.s.religion', 'Sex', 'Type.of.place.of.residence', 'Household.structure',
'Type.of.caste.or.tribe.of.the.household.head','Type.of.cooking.fuel','Source.of.drinking.water']
# Sanity check of the group sizes: 25 ordinal + 7 nominal + 4 continuous.
len(ord_list)
25
len(nom_list)
7
len(cont_list)
4
from fdc.fdc import feature_clustering
from fdc.fdc import FDC, Clustering
from fdc.fdc import canberra_modified
modified_can = canberra_modified
# Per-group distance metrics: euclidean for continuous, modified Canberra
# for ordinal, Hamming for nominal features.
fdc = FDC(clustering_cont=Clustering('euclidean')
          , clustering_ord=Clustering(modified_can)
          , clustering_nom=Clustering('hamming', max_components=1)
          , visual=True
          , use_pandas_output=True
          , with_2d_embedding=True
          )
# NOTE(review): 'continueous' / 'nomial' are the fdc API's own (misspelled)
# keyword names as called here — do not "fix" without changing the library.
fdc.selectFeatures(continueous=cont_list, nomial=nom_list, ordinal=ord_list)
# Returns the concatenated high-dimensional FDC embedding and its 2-D UMAP
# projection (per the timing log below: per-group clustering, concat,
# UMAP 5 -> 2, then DataFrame conversion and plotting).
FDC_emb_high,FDC_emb_low = fdc.normalize(data,n_neighbors=15, min_dist=0.1,cont_list=cont_list, nom_list=nom_list, ord_list=ord_list,
                                         with_2d_embedding=True,
                                         visual=True)
FDC.normalize (init): 0.00000 / 0.000s FDC.normalize (clustering CONT): 7.06250 / 7.062s FDC.normalize (clustering ORD): 80.18750 / 87.250s FDC.normalize (clustering NOM): 63.71875 / 150.969s FDC.normalize (concat): 0.00000 / 150.969s FDC.normalize (umap 5 -> 2): 7.68750 / 158.656s FDC.normalize (array -> DataFrame): 0.00000 / 158.656s
FDC.normalize (plotting): 1.45312 / 160.109s FDC.normalize (array -> DataFrame): 0.00000 / 160.109s FDC.normalize (total): 0.00000 / 160.109s
from fdc.clustering import Clustering
# Cluster the high-dimensional FDC embedding (FDC_emb_low is the 2-D
# projection used by the library for visualisation).
clustering = Clustering(FDC_emb_high, FDC_emb_low, True)

# --- k-means, k=4 ---
cluster_list, cluster_counts = clustering.K_means(4)
FDC_emb_high['Cluster'] = cluster_list  # keep labels alongside the embedding
# BUG FIX: validity metrics must be computed on the embedding features only.
# The original scored FDC_emb_high *after* appending the 'Cluster' column,
# so the labels themselves leaked into every distance and biased the scores
# (recorded contaminated values: silhouette 0.3288..., Dunn 0.02534...).
emb_features = FDC_emb_high.drop(columns='Cluster')
silhouette_score(emb_features, cluster_list, metric='euclidean')
dunn_index(cluster_wise_df(emb_features, cluster_list))
elbow_plot(emb_features)
Silhouette_visual(emb_features)

# --- Agglomerative, 4 clusters, euclidean + Ward linkage ---
cluster_list_agglo, cluster_counts_agglo = clustering.Agglomerative(4, 'euclidean', 'ward')
FDC_emb_high['Cluster'] = cluster_list_agglo
emb_features = FDC_emb_high.drop(columns='Cluster')
# (recorded contaminated values: silhouette 0.3380..., Dunn 0.02831...)
silhouette_score(emb_features, cluster_list_agglo, metric='euclidean')
dunn_index(cluster_wise_df(emb_features, cluster_list_agglo))

# --- DBSCAN (presumably eps=1.2, min_samples=150 — verify fdc signature) ---
cluster_list_dbscan, cluster_counts_dbscan = clustering.DBSCAN(1.2, 150)
FDC_emb_high['Cluster'] = cluster_list_dbscan
# Removing noise indices (DBSCAN label -1) from both embeddings before scoring.
cluster_list_dbscan = np.array(cluster_list_dbscan)
non_noise_indices = np.where(cluster_list_dbscan != -1)
FDC_emb_high = FDC_emb_high.iloc[non_noise_indices]
FDC_emb_low = FDC_emb_low.iloc[non_noise_indices]
cluster_list_dbscan = cluster_list_dbscan[non_noise_indices]
emb_features = FDC_emb_high.drop(columns='Cluster')
# (recorded contaminated values: silhouette 0.2361..., Dunn 0.02337...)
silhouette_score(emb_features, cluster_list_dbscan, metric='euclidean')
dunn_index(cluster_wise_df(emb_features, cluster_list_dbscan))